About

In this markdown, we will do a comparison of the transcriptomes during development of Ptychodera, Amphixous, and Purple Sea Urchin.

For this, we will use a solution of custom R functions and wrappers that we have named “comparABle” (kudos to @agilgal for the name!).

Load libraries

library(tidyverse)
library(stringr)
library(BiocGenerics)
library(EDASeq)
library(DESeq2)
library(tximport)
library(limma)
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
require(data.table)
require(stringr)
require(tidytree)
require(ggtree)
require(rvcheck)
require(treeio)
require(dendextend)
require(phangorn)
require(phytools)
require(ComplexHeatmap)
require(foreach)
require(doParallel)
require(colorspace)
library(topGO)

Load functions

source("code/r_code/r_functions/sourcefolder.R")

sourceFolder(
  "code/r_code/r_functions",
  recursive = TRUE
  )

sourceFolder(
  "code/r_code/r_general",
  recursive = TRUE
  )

Load Data

We will first load the data of our three species:

# Ptychodera analyses
load("outputs/rda/deseq2.rda")
# Amphioxus reanalysis
load("outputs/rda/blan_reanalysis.rda")
# Purple Sea Urchin reanalysis
load("outputs/rda/spur_reanalysis.rda")

We will follow by loading all the gene family data that we need. This is the gene/gfam lookup table (extracted from the OrthoXML output file from OMA) and the respective table to transform the gene indexes into the actual gene id of every species.

gene_gfam <- read.delim2("outputs/comparative/20240404_oma/oma_omaid_gfam.tsv", header = TRUE)
colnames(gene_gfam) <- c("id","gfam")
gene_gfam$id <- gsub("\\.p[0-9]+","",gene_gfam$id)

This gene_gfam file will be used later on as proxy to compare developmental stages.

head(gene_gfam)
##           id     gfam
## 1 CAEEL00007 HOG00001
## 2 SCHMD09491 HOG00001
## 3 SCHMD28687 HOG00002
## 4 SCHMD02140 HOG00002
## 5 CAEEL00052 HOG00002
## 6 CAEEL00105 HOG00003
oma_pfla_blan <- read.delim2("outputs/comparative/20240404_oma/one_to_one_orthologues/1to1_pfla_blan.tsv", header = FALSE)
oma_pfla_blan$V2 <- gsub(" ","",oma_pfla_blan$V2)
oma_pfla_spur <- read.delim2("outputs/comparative/20240404_oma/one_to_one_orthologues/1to1_pfla_spur.tsv", header = FALSE)
oma_pfla_spur$V2 <- gsub(" ","",oma_pfla_spur$V2)
load("outputs/rda/geneage.rda")

For more functional annotation, we will use the COG functional categories and the GO terms of Ptychodera.

pfla_cogs <- read.table(
  "outputs/functional_annotation/COGs/pfla_cogs.tsv",
  col.names = c("id","cog")
)

# GO terms
pfla_id2go <- 
  readMappings(
    "outputs/functional_annotation/go_blast2go/GO_annotation.txt"
  )

Ptychodera and Amphioxus

We will first compare the transcriptomes and stage-specific clusters of Ptychodera and Amphioxus. For this, we will define a number of objects that will enter into our custom wrapper function comparABle. The most important of them being:

In addition, we will also pass it a list of gene ontologies, a COG functional category association file, and a list of gene ages that are shared between species A and B.

# Expression data
a = pfla_rna_counts
b = blan_counts

# Samples for rowmeans_by repl
a_samples = levels(condition_x)
b_samples = unique(sub("_.$", "", colnames(b)))

# Family/orthology data
o = unique(oma_pfla_blan)
f = gene_gfam[grep("TCONS|BRALA",gene_gfam$id),]

# Module/Cluster information
ma = data.frame(
  id = rownames(pfla_rna_dev),
  module = pfla_rna_dev$cID
)

mb = blan_cl
colnames(mb) <- c("id","module")

# Gene Age
ga = pfla_age[,c(1,3)]
colnames(ga) = c("id","age")

# COG
cog_a <- pfla_cogs

# GOs
a_universe = rownames(vsd_allgen)
a_id2go = pfla_id2go

# Common Evo Nodes
common_evo_nodes = unique(ga$age)[!(unique(ga$age) %in% c("6_ambulacraria","7_hemichordata","8_Pfla"))]

Below we will run the comparABle wrapper. Briefly, what this wrapper does is:

This can take some time.

PFLA_BLAN_COMPARISON <- comparABle(
  a_name = "P.flava",
  b_name = "B.lanceolatum",
  a = a,
  b = b,
  o = o,
  f = f,
  ma = ma,
  mb = mb,
  ga = ga,
  gb = gb,
  cog_a = cog_a,
  cog_b = cog_b,
  a_samples = a_samples,
  b_samples = b_samples,
  highlyvariable = FALSE,
  cooc_p = 0.05,
  cooc_h = c(0.70,0.95),
  cooc_cor_method = "pearson",
  a_universe = a_universe,
  a_id2go = a_id2go,
  common_evo_nodes = common_evo_nodes,
  sep = ",\ "
)
## [1] "Tidy up data"
## [1] "Merge data"
## [1] "Correlations"
## [1] "PCA"
## [1] "Co-Occurrence"
## [1] "Common genes in Correlations"
## The top five similar pair of stages are: 16.60773 16.80909 17.04698 17.11295 17.20339 
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
## [1] "Common genes in Correlations (GO)"
## [1] "Starting analysis 1 of 5"
## [1] "Starting analysis 2 of 5"
## [1] "Starting analysis 3 of 5"
## [1] "Starting analysis 4 of 5"
## [1] "Starting analysis 5 of 5"
## [1] "Common genes in Correlations (age)"
## [1] "Pairwise Orthology Overlap Strategy across modules -- hypergeometric and binonmial tests"
## [1] "Getting info on genes from shared families across modules"
## [1] "Starting analysis 1 of 20"
## [1] "Starting analysis 2 of 20"
## [1] "Starting analysis 3 of 20"
## [1] "Starting analysis 4 of 20"
## [1] "Starting analysis 5 of 20"
## [1] "Starting analysis 6 of 20"
## [1] "Starting analysis 7 of 20"
## [1] "Starting analysis 8 of 20"
## [1] "Starting analysis 9 of 20"
## [1] "Starting analysis 10 of 20"
## [1] "Starting analysis 11 of 20"
## [1] "Starting analysis 12 of 20"
## [1] "Starting analysis 13 of 20"
## [1] "Starting analysis 14 of 20"
## [1] "Starting analysis 15 of 20"
## [1] "Starting analysis 16 of 20"
## [1] "Starting analysis 17 of 20"
## [1] "Starting analysis 18 of 20"
## [1] "Starting analysis 19 of 20"
## [1] "Starting analysis 20 of 20"
## [1] "Plots"
## [1] "Generating results"

After running this we can start exploring the comparisons. First we will tidy up the pairwise correlation data and add names to the rows and columns of the jensen-shannon distance matrix.

Here are the heatmaps of pairwise correlations between Ptychodera and Amphioxus.

plot_cors(PFLA_BLAN_COMPARISON$pairwise_correlations)

A quick sanity check that this is consistently observed regardles of genes by doing some bootstraping and checkin the mean observed JSD:

js_mean_pfla_blan <- jsd_with_subsampling(
  a_o = PFLA_BLAN_COMPARISON$merged_data$a_o , 
  b_o = PFLA_BLAN_COMPARISON$merged_data$b_o, 
  n = 1000, p = 0.25
)

#relativise to min and max values
js_mean_rel <- relativise(js_mean_pfla_blan$mean)

h3_avg_rel <- Heatmap(js_mean_rel,cluster_rows = F, cluster_columns = F, show_row_names = TRUE, name = "JSD", col = brewer.pal(10,"RdBu"))
draw(h3_avg_rel)

Here is the average expression profile of genes more expressed in the most similar stages between Ptychodera an Amphioxus, including GO terms and age enrichment:

# Common genes, highly correlated
p1 <- plot_hicor_genes(
  scatter = PFLA_BLAN_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_05_MG__08_18h$plot_topgenes,
  GO_plot = PFLA_BLAN_COMPARISON$high_corr_genes$GOs$GOplot$cor_05_MG__08_18h,
  age_table = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_05_MG__08_18h$AgeperModule,
  age_enrichemnt_hm = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_05_MG__08_18h$heatmap
)
p2 <- plot_hicor_genes(
  scatter = PFLA_BLAN_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_04_EG__08_18h$plot_topgenes,
  GO_plot = PFLA_BLAN_COMPARISON$high_corr_genes$GOs$GOplot$cor_04_EG__08_18h,
  age_table = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_04_EG__08_18h$AgeperModule,
  age_enrichemnt_hm = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_04_EG__08_18h$heatmap
)
p3 <- plot_hicor_genes(
  scatter = PFLA_BLAN_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_10_Me__14_Premet$plot_topgenes,
  GO_plot = PFLA_BLAN_COMPARISON$high_corr_genes$GOs$GOplot$cor_10_Me__14_Premet,
  age_table = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_10_Me__14_Premet$AgeperModule,
  age_enrichemnt_hm = PFLA_BLAN_COMPARISON$high_corr_genes$age$cor_10_Me__14_Premet$heatmap
)

plot_grid(p1,p2,p3,ncol = 1)

Here is the co-occurrence analysis showcasing the Amphioxus stages most similar to Ptychodera gastrulation are the stages corresponding to neurulation.

Heatmap(
  name = "co-occurrence",
  PFLA_BLAN_COMPARISON$coocurrence_analysis$cooccurrence,
  cluster_rows = PFLA_BLAN_COMPARISON$coocurrence_analysis$tree,
  cluster_columns = PFLA_BLAN_COMPARISON$coocurrence_analysis$tree,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Heatmap(
  name = "co-occurrence",
  PFLA_BLAN_COMPARISON$coocurrence_analysis$cooccurrence[1:16,17:28],
  cluster_rows = F,
  cluster_columns = F,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Here the orthology overlap strategy showcasing similarities at the gene family usage between hemichordate larval stages and amphioxus post-gastrulation development

pf_avg_hm+
PFLA_BLAN_COMPARISON$plots$orthology_overlap_binomial_hm+
PFLA_BLAN_COMPARISON$plots$orthology_overlap_hypgeom_hm

Here is a quick overview of the stats of the hypergeometric and binomial tests for pairwise comparisons:

summary(PFLA_BLAN_COMPARISON$orthology_overlap_modules$pairwise_module_comparison$stats)
##    module_a           module_b         success_in_samples sample_size       
##  Length:544         Length:544         Length:544         Length:544        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  success_in_pop       gene_POP          hypgeom_pval      hypgeom_log      
##  Length:544         Length:544         Min.   :0.00000   Min.   :  0.0000  
##  Class :character   Class :character   1st Qu.:0.05191   1st Qu.:  0.1462  
##  Mode  :character   Mode  :character   Median :0.42590   Median :  0.8536  
##                                        Mean   :0.45954   Mean   :  2.4758  
##                                        3rd Qu.:0.86401   3rd Qu.:  2.9582  
##                                        Max.   :1.00000   Max.   :121.9379  
##    binom_pval      gfams_common       gfams_excl_a       gfams_excl_b      
##  Min.   :0.00000   Length:544         Length:544         Length:544        
##  1st Qu.:0.01856   Class :character   Class :character   Class :character  
##  Median :0.20632   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.33077                                                           
##  3rd Qu.:0.60450                                                           
##  Max.   :1.00000

These is a quick overview at the number of genes in enriched gene families between pairs of comparisons:

# Group by module and summarize
commongenes <- PFLA_BLAN_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$table_a_common %>%
  dplyr::group_by(module) %>%
  dplyr::summarize(
    numgenes = n(),
    genes = paste(ifelse(n() > 3, paste0(paste(head(id, 3), collapse = ",")," ..."), paste(id, collapse = ", ")), collapse = ", ")
  )

# Rename columns
colnames(commongenes) <- c("module", "numgenes", "genes")

commongenes
## # A tibble: 20 × 3
##    module numgenes genes                                           
##    <chr>     <int> <chr>                                           
##  1 01__3        69 TCONS_00055592,TCONS_00055656,TCONS_00055961 ...
##  2 02__2        43 TCONS_00032521,TCONS_00024643,TCONS_00024736 ...
##  3 02__3        65 TCONS_00031538,TCONS_00022997,TCONS_00024599 ...
##  4 02__4        37 TCONS_00024188,TCONS_00055307,TCONS_00056697 ...
##  5 02__6        62 TCONS_00056311,TCONS_00055789,TCONS_00057275 ...
##  6 03__8        43 TCONS_00024709,TCONS_00024711,TCONS_00024362 ...
##  7 04__15       29 TCONS_00055090,TCONS_00055157,TCONS_00055995 ...
##  8 04__8        32 TCONS_00024714,TCONS_00024919,TCONS_00024987 ...
##  9 06__3        76 TCONS_00038493,TCONS_00038499,TCONS_00038524 ...
## 10 06__6       143 TCONS_00038727,TCONS_00023049,TCONS_00024735 ...
## 11 07__3        65 TCONS_00039102,TCONS_00054786,TCONS_00023200 ...
## 12 07__6        51 TCONS_00031847,TCONS_00023459,TCONS_00054892 ...
## 13 12__8        27 TCONS_00038771,TCONS_00031684,TCONS_00025139 ...
## 14 14__18       86 TCONS_00024858,TCONS_00023794,TCONS_00025437 ...
## 15 17__21      140 TCONS_00032533,TCONS_00023193,TCONS_00024774 ...
## 16 19__22       73 TCONS_00023644,TCONS_00025571,TCONS_00044473 ...
## 17 19__24      112 TCONS_00038643,TCONS_00009468,TCONS_00024996 ...
## 18 20__21      119 TCONS_00038772,TCONS_00024487,TCONS_00023562 ...
## 19 21__8        66 TCONS_00025412,TCONS_00024226,TCONS_00056214 ...
## 20 22__24       63 TCONS_00038759,TCONS_00025504,TCONS_00024155 ...

And here the combination of gene age and functional categories enrichment:

PFLA_BLAN_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    age_a_common$heatmap +
PFLA_BLAN_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    cog_a_comon$heatmap

And here the gene ontology enrichment of enriched gene families between pairs of modules.

# Add a new column with the name of the original data frame
keygenes_commonfams_GO <- 
  bind_rows(
    PFLA_BLAN_COMPARISON$orthology_overlap_modules$
      genes_in_common_fams$commonfams$go_a_common$GOtable, 
    .id = "pair_gfams"
    )

# Rename the new column
colnames(keygenes_commonfams_GO)[1] <- "pair_gfams"

keygenes_commonfams_GO$logp <- 
  -log10(as.numeric(keygenes_commonfams_GO$classicFisher))

keygenes_commonfams_GO$logp[is.na(keygenes_commonfams_GO$logp)] <- 
  max(keygenes_commonfams_GO$logp, na.rm = T)

keygenes_commonfams_GO$obsexp <- 
  as.numeric(keygenes_commonfams_GO$Significant/keygenes_commonfams_GO$Expected)

keygenes_commonfams_GO <- 
  keygenes_commonfams_GO[
    order(
      keygenes_commonfams_GO$pair_gfams,
      keygenes_commonfams_GO$Term
      ),
    ]

keygenes_commonfams_GO$Term <- 
  factor(
    keygenes_commonfams_GO$Term, 
    levels = unique(keygenes_commonfams_GO$Term)
    )

The GO plot here:

gg <- keygenes_commonfams_GO  %>% 
  ggplot(
    aes(x=pair_gfams, y = Term, size = Significant, color = logp)
    ) + geom_point() +
  theme_bw() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8 )
    )+ 
  theme(
    axis.text.y.left = element_text(hjust = 1, size = 8 ),
    legend.title=element_text(size=8 ),
    legend.text=element_text(size=8 )
    )+ 
  scale_size_continuous(
    name = "Genes affected",
    range = c(2,6)
    ) + scale_color_viridis("-log10(P-value)") +
  xlab("")+ylab("")

gg

Ptychodera and Purple Sea Urchin

We will proceed the same as before using sea urchin in this case. Again, we define all the data necessary for the comparABle wrapper:

# Expression data
a = pfla_rna_counts
b = spur_vsd

# Samples for rowmeans_by repl
a_samples = levels(condition_x)
b_samples = unique(sub("_.$", "", colnames(b)))

# Family/orthology data
o = unique(oma_pfla_spur)
f = gene_gfam[grep("TCONS|STRPU",gene_gfam$id),]

# Module/Cluster information
ma = data.frame(
  id = rownames(pfla_rna_dev),
  module = pfla_rna_dev$cID
)

mb = spur_cl
colnames(mb) <- c("id","module")

# Gene Age
ga = pfla_age[,c(1,3)]
colnames(ga) = c("id","age")

# COG
cog_a <- pfla_cogs

# GOs
a_universe = rownames(vsd_allgen)
a_id2go = pfla_id2go

# Common Evo Nodes
common_evo_nodes = unique(ga$age)[!(unique(ga$age) %in% c("7_hemichordata","8_Pfla"))]

And we run comparABle:

PFLA_SPUR_COMPARISON <- comparABle(
  a_name = "P.flava",
  b_name = "S.purpuratus",
  a = a,
  b = b,
  o = o,
  f = f,
  ma = ma,
  mb = mb,
  ga = ga,
  gb = gb,
  cog_a = cog_a,
  cog_b = cog_b,
  a_samples = a_samples,
  b_samples = b_samples,
  highlyvariable = FALSE,
  cooc_p = 0.25,
  cooc_h = c(0.6,0.9),
  cooc_n = 10000,
  cooc_cor_method = "pearson",
  a_universe = a_universe,
  a_id2go = a_id2go,
  common_evo_nodes = common_evo_nodes,
  sep = ",\ "
)
## [1] "Tidy up data"
## [1] "Merge data"
## [1] "Correlations"
## [1] "PCA"
## [1] "Co-Occurrence"
## [1] "Common genes in Correlations"
## The top five similar pair of stages are: 17.16678 17.32802 17.74724 17.84709 17.96809 
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
## [1] "Common genes in Correlations (GO)"
## [1] "Starting analysis 1 of 5"
## [1] "Starting analysis 2 of 5"
## [1] "Starting analysis 3 of 5"
## [1] "Starting analysis 4 of 5"
## [1] "Starting analysis 5 of 5"
## [1] "Common genes in Correlations (age)"
## [1] "Pairwise Orthology Overlap Strategy across modules -- hypergeometric and binonmial tests"
## [1] "Getting info on genes from shared families across modules"
## [1] "Starting analysis 1 of 20"
## [1] "Starting analysis 2 of 20"
## [1] "Starting analysis 3 of 20"
## [1] "Starting analysis 4 of 20"
## [1] "Starting analysis 5 of 20"
## [1] "Starting analysis 6 of 20"
## [1] "Starting analysis 7 of 20"
## [1] "Starting analysis 8 of 20"
## [1] "Starting analysis 9 of 20"
## [1] "Starting analysis 10 of 20"
## [1] "Starting analysis 11 of 20"
## [1] "Starting analysis 12 of 20"
## [1] "Starting analysis 13 of 20"
## [1] "Starting analysis 14 of 20"
## [1] "Starting analysis 15 of 20"
## [1] "Starting analysis 16 of 20"
## [1] "Starting analysis 17 of 20"
## [1] "Starting analysis 18 of 20"
## [1] "Starting analysis 19 of 20"
## [1] "Starting analysis 20 of 20"
## [1] "Plots"
## [1] "Generating results"

The heatmaps of pairwise correlations and jensen-shannon divergence between Ptychodera and Sea Urchin:

plot_cors(PFLA_SPUR_COMPARISON$pairwise_correlations)

A quick sanity check that this is consistently observed regardles of genes by doing some bootstraping and checkin the mean observed JSD:

js_mean_pfla_spur <- jsd_with_subsampling(
  a_o = PFLA_SPUR_COMPARISON$merged_data$a_o , 
  b_o = PFLA_SPUR_COMPARISON$merged_data$b_o, 
  n = 1000, p = 0.25
)

#relativise to min and max values
js_mean_rel <- relativise(js_mean_pfla_spur$mean)

h3_avg_rel <- Heatmap(js_mean_rel,cluster_rows = F, cluster_columns = F, show_row_names = TRUE, name = "JSD", col = brewer.pal(10,"RdBu"))
draw(h3_avg_rel)

Here is the average expression profile of genes more expressed in the most similar stages between Ptychodera an Sea Urchin, including GO terms and age enrichment:

# Common genes, highly correlated
p1 <- plot_hicor_genes(
  scatter = PFLA_SPUR_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[1]]$plot_topgenes,
  GO_plot = PFLA_SPUR_COMPARISON$high_corr_genes$GOs$GOplot[[1]],
  age_table = PFLA_SPUR_COMPARISON$high_corr_genes$age[[1]]$AgeperModule,
  age_enrichemnt_hm = PFLA_SPUR_COMPARISON$high_corr_genes$age[[1]]$heatmap
)
p2 <- plot_hicor_genes(
  scatter = PFLA_SPUR_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[2]]$plot_topgenes,
  GO_plot = PFLA_SPUR_COMPARISON$high_corr_genes$GOs$GOplot[[2]],
  age_table = PFLA_SPUR_COMPARISON$high_corr_genes$age[[2]]$AgeperModule,
  age_enrichemnt_hm = PFLA_SPUR_COMPARISON$high_corr_genes$age[[2]]$heatmap
)
p3 <- plot_hicor_genes(
  scatter = PFLA_SPUR_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[3]]$plot_topgenes,
  GO_plot = PFLA_SPUR_COMPARISON$high_corr_genes$GOs$GOplot[[3]],
  age_table = PFLA_SPUR_COMPARISON$high_corr_genes$age[[3]]$AgeperModule,
  age_enrichemnt_hm = PFLA_SPUR_COMPARISON$high_corr_genes$age[[3]]$heatmap
)

plot_grid(p1,p2,p3,ncol = 1)

And the co-ocurrence matrix shocasing ptychodera gastrulation is similar to sea urchin gastrulation as well as similarities between larval stages

Heatmap(
  name = "co-occurrence",
  PFLA_SPUR_COMPARISON$coocurrence_analysis$cooccurrence,
  cluster_rows = PFLA_SPUR_COMPARISON$coocurrence_analysis$tree,
  cluster_columns = PFLA_SPUR_COMPARISON$coocurrence_analysis$tree,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Heatmap(
  name = "co-occurrence",
  PFLA_SPUR_COMPARISON$coocurrence_analysis$cooccurrence[1:16,17:33],
  cluster_rows = F,
  cluster_columns = F,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Below the orthology overlap strategy showcasing similarities at the gene family usage between species:

pf_avg_hm+
PFLA_SPUR_COMPARISON$plots$orthology_overlap_binomial_hm+
PFLA_SPUR_COMPARISON$plots$orthology_overlap_hypgeom_hm

An overview of the stats of these tests:

str(PFLA_SPUR_COMPARISON$orthology_overlap_modules$pairwise_module_comparison$stats)
## 'data.frame':    539 obs. of  12 variables:
##  $ module_a          : chr  "01" "01" "01" "01" ...
##  $ module_b          : chr  "1" "2" "3" "4" ...
##  $ success_in_samples: chr  "15" "3" "1" "5" ...
##  $ sample_size       : chr  "1216" "1216" "1216" "1216" ...
##  $ success_in_pop    : chr  "228" "142" "54" "127" ...
##  $ gene_POP          : chr  "21714" "21714" "21714" "21714" ...
##  $ hypgeom_pval      : num  0.297 0.988 0.956 0.845 0.952 ...
##  $ hypgeom_log       : num  1.2137 0.0123 0.0454 0.1678 0.0489 ...
##  $ binom_pval        : num  0.481 0.104 0.384 0.571 0.272 ...
##  $ gfams_common      : chr  "HOG55451, HOG40727, HOG40201, HOG43314, HOG48918, HOG39060, HOG31265, HOG40645, HOG54915, HOG34305, HOG44090, H"| __truncated__ "HOG47563, HOG31748, HOG44090" "HOG53555" "HOG33074, HOG53887, HOG31380, HOG45441, HOG31512" ...
##  $ gfams_excl_a      : chr  "HOG54246, HOG16235, HOG42583, HOG17003, HOG41206, HOG34685, HOG48839, HOG16220, HOG16218, HOG49524, HOG42536, H"| __truncated__ "HOG54246, HOG16235, HOG42583, HOG17003, HOG41206, HOG34685, HOG48839, HOG16220, HOG16218, HOG49524, HOG42536, H"| __truncated__ "HOG54246, HOG16235, HOG42583, HOG17003, HOG41206, HOG34685, HOG48839, HOG16220, HOG16218, HOG49524, HOG42536, H"| __truncated__ "HOG54246, HOG16235, HOG42583, HOG17003, HOG41206, HOG34685, HOG48839, HOG16220, HOG16218, HOG49524, HOG42536, H"| __truncated__ ...
##  $ gfams_excl_b      : chr  "HOG32296, HOG36743, HOG23894, HOG48192, HOG48380, HOG42484, HOG44862, HOG24516, HOG40849, HOG39220, HOG54725, H"| __truncated__ "HOG26804, HOG23791, HOG51496, HOG56811, HOG30044, HOG44718, HOG23773, HOG27323, HOG54668, HOG57047, HOG25257, H"| __truncated__ "HOG24688, HOG54145, HOG27339, HOG25126, HOG23905, HOG27330, HOG29443, HOG53794, HOG26814, HOG43970, HOG55829, H"| __truncated__ "HOG48649, HOG40947, HOG25165, HOG38585, HOG27162, HOG24564, HOG27299, HOG32065, HOG29357, HOG39830, HOG25222, H"| __truncated__ ...

A look at the number of genes belonging to enriched gene families between comparisons

# Group by module and summarize
commongenes <- PFLA_SPUR_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$table_a_common %>%
  dplyr::group_by(module) %>%
  dplyr::summarize(
    numgenes = n(),
    genes = paste(ifelse(n() > 3, paste0(paste(head(id, 3), collapse = ",")," ..."), paste(id, collapse = ", ")), collapse = ", ")
  )

# Rename columns
colnames(commongenes) <- c("module", "numgenes", "genes")

commongenes
## # A tibble: 20 × 3
##    module numgenes genes                                           
##    <chr>     <int> <chr>                                           
##  1 06__1        40 TCONS_00038727,TCONS_00023138,TCONS_00056203 ...
##  2 08__7        17 TCONS_00056765,TCONS_00009854,TCONS_00018319 ...
##  3 14__16       98 TCONS_00023217,TCONS_00024858,TCONS_00025437 ...
##  4 15__15       31 TCONS_00023592,TCONS_00023914,TCONS_00057045 ...
##  5 15__21       39 TCONS_00032648,TCONS_00023914,TCONS_00055872 ...
##  6 15__23       45 TCONS_00032648,TCONS_00025327,TCONS_00024232 ...
##  7 17__15       52 TCONS_00023193,TCONS_00024774,TCONS_00024813 ...
##  8 17__16       53 TCONS_00032297,TCONS_00057227,TCONS_00010868 ...
##  9 17__21       64 TCONS_00032270,TCONS_00024774,TCONS_00024813 ...
## 10 17__23       64 TCONS_00032297,TCONS_00032533,TCONS_00023193 ...
## 11 19__15       49 TCONS_00023644,TCONS_00025571,TCONS_00054896 ...
## 12 19__21       80 TCONS_00024996,TCONS_00023644,TCONS_00025571 ...
## 13 19__23       75 TCONS_00024996,TCONS_00023644,TCONS_00025571 ...
## 14 19__24       99 TCONS_00023497,TCONS_00024996,TCONS_00023644 ...
## 15 21__21       59 TCONS_00023732,TCONS_00056554,TCONS_00055546 ...
## 16 21__23       63 TCONS_00024739,TCONS_00025412,TCONS_00024226 ...
## 17 22__15       32 TCONS_00055259,TCONS_00056618,TCONS_00010938 ...
## 18 22__21       50 TCONS_00056618,TCONS_00055613,TCONS_00056881 ...
## 19 22__23       41 TCONS_00055613,TCONS_00055692,TCONS_00056881 ...
## 20 22__24       74 TCONS_00024615,TCONS_00024668,TCONS_00055259 ...

The gene age enrichment and functional category enrichment of these genes:

PFLA_SPUR_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    age_a_common$heatmap +
PFLA_SPUR_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    cog_a_comon$heatmap

And the GO terms of these genes:

# Add a new column with the name of the original data frame
keygenes_commonfams_GO <- 
  bind_rows(
    PFLA_SPUR_COMPARISON$orthology_overlap_modules$
      genes_in_common_fams$commonfams$go_a_common$GOtable, 
    .id = "pair_gfams"
    )

# Rename the new column
colnames(keygenes_commonfams_GO)[1] <- "pair_gfams"

keygenes_commonfams_GO$logp <- 
  -log10(as.numeric(keygenes_commonfams_GO$classicFisher))

keygenes_commonfams_GO$logp[is.na(keygenes_commonfams_GO$logp)] <- 
  max(keygenes_commonfams_GO$logp, na.rm = T)

keygenes_commonfams_GO$obsexp <- 
  as.numeric(keygenes_commonfams_GO$Significant/keygenes_commonfams_GO$Expected)

keygenes_commonfams_GO <- 
  keygenes_commonfams_GO[
    order(
      keygenes_commonfams_GO$pair_gfams,
      keygenes_commonfams_GO$Term
      ),
    ]

keygenes_commonfams_GO$Term <- 
  factor(
    keygenes_commonfams_GO$Term, 
    levels = unique(keygenes_commonfams_GO$Term)
    )

The plot of the GO enrichment analysis:

gg <- keygenes_commonfams_GO  %>% 
  ggplot(
    aes(x=pair_gfams, y = Term, size = Significant, color = logp)
    ) + geom_point() +
  theme_bw() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8 )
    )+ 
  theme(
    axis.text.y.left = element_text(hjust = 1, size = 8 ),
    legend.title=element_text(size=8 ),
    legend.text=element_text(size=8 )
    )+ 
  scale_size_continuous(
    name = "Genes affected",
    range = c(2,6)
    ) + scale_color_viridis("-log10(P-value)") +
  xlab("")+ylab("")
stages_pf_sp = 
  list(
    a = c("04_EG","04_EG"),
    b = c("03_MeBl_24h1","03_MeBl_24h2")
  )

pfla_spur_hicor_genes <- get_high_cor_genes(
  mat = PFLA_SPUR_COMPARISON$pairwise_correlations$js,
  a_o = PFLA_SPUR_COMPARISON$merged_data$a_o,
  b_o = PFLA_SPUR_COMPARISON$merged_data$b_o,
  stages = stages_pf_sp,
  o = PFLA_SPUR_COMPARISON$input$o
)
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
pfla_spur_hicor_genes_GOs <- 
  getGOs(
    genelist = 
      lapply(
        pfla_spur_hicor_genes$hicor_topgenes, 
        function(sub_list) {
          setNames(sub_list$top_genes$a, names(sub_list))
        }),
    gene_universe = a_universe,
    gene2GO = a_id2go,
    max_terms = 10
  )
## [1] "Starting analysis 1 of 2"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 2068 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## [1] "Starting analysis 2 of 2"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 2255 nontrivial nodes
##       parameters: 
##           test statistic: fisher
print("Common genes in Correlations (age)")
## [1] "Common genes in Correlations (age)"
pfla_spur_hicor_genes_age <- 
  lapply(
    lapply(
      pfla_spur_hicor_genes$hicor_topgenes,
      function(sub_list) {
        data.frame(
          id = a_universe,
          module = 
            ifelse(
              a_universe %in% sub_list$top_genes$a, "common","not_common"
            )
        )
      }
    ),
    function(x){
      gene_age_enrichment(
        x_modules = x,
        x_age = ga[ga$age %in% common_evo_nodes,]
      )
    }
  )


# Common genes, highly correlated
p1 <- plot_hicor_genes(
  scatter = pfla_spur_hicor_genes$hicor_topgenes[[1]]$plot_topgenes,
  GO_plot = pfla_spur_hicor_genes_GOs$GOplot[[1]],
  age_table = pfla_spur_hicor_genes_age[[1]]$AgeperModule,
  age_enrichemnt_hm = pfla_spur_hicor_genes_age[[1]]$heatmap
)
## No id variables; using all as measure variables
p2 <- plot_hicor_genes(
  scatter = pfla_spur_hicor_genes$hicor_topgenes[[2]]$plot_topgenes,
  GO_plot = pfla_spur_hicor_genes_GOs$GOplot[[2]],
  age_table = pfla_spur_hicor_genes_age[[2]]$AgeperModule,
  age_enrichemnt_hm = pfla_spur_hicor_genes_age[[2]]$heatmap
)
## No id variables; using all as measure variables
plot_grid(
  p1,
  p2,
  ncol = 1)

Alternative method for 1-to-1 orthologs: Blast best reciprocal hits

Comparison of 1-to-1 TFs using blast best RBHs

rbh_pfla_blan <- read.delim2("outputs/comparative/rbh/pfla_blan_rbh.tsv", header = FALSE)
rbh_pfla_blan[,2] <- gsub(" ","",rbh_pfla_blan[,2])
rbh_pfla_blan[,1] <- gsub("\\.p[0-9]+","",rbh_pfla_blan[,1])

rbh_pfla_spur <- read.delim2("outputs/comparative/rbh/pfla_spur_rbh.tsv", header = FALSE)
rbh_pfla_spur[,2] <- gsub(" ","",rbh_pfla_spur[,2])
rbh_pfla_spur[,1] <- gsub("\\.p[0-9]+","",rbh_pfla_spur[,1])

Ptychodera and Amphioxus

# Expression data, using all genes. No need for transformation
a = PFLA_BLAN_COMPARISON$input$a
b = PFLA_BLAN_COMPARISON$input$b

# Family/orthology data
o = unique(rbh_pfla_blan)

# MERGE
o = pair2id(o)

pf_bl_merge_ab <- mergedata(a,b,o)

# CORRELATIONS
pf_bl_rbh_cors_allgen <- rawcorsp(pf_bl_merge_ab$a_o,pf_bl_merge_ab$b_o) # FIX JSD

plot_cors(pf_bl_rbh_cors_allgen)

Ptychodera and Sea Urchin

# Expression data
a = PFLA_SPUR_COMPARISON$input$a
b = PFLA_SPUR_COMPARISON$input$b

# Family/orthology data
o = unique(rbh_pfla_spur)
o = pair2id(o)

# MERGE
pf_sp_merge_ab <- mergedata(a,b,o)

# CORRELATIONS
print("Correlations")
## [1] "Correlations"
pf_sp_rbh_cors_allgen <- rawcorsp(pf_sp_merge_ab$a_o,pf_sp_merge_ab$b_o) # FIX JSD

plot_cors(pf_sp_rbh_cors_allgen)

Saving the data

We will save these two objects for further analysis and reference:

save(
  PFLA_BLAN_COMPARISON,
  PFLA_SPUR_COMPARISON,
  file = "outputs/rda/species_comparison.rda"
)

Figures for the paper

These are just calls to PDFs for plotting the figures we have generated.